# This code chunk simply makes sure that all the libraries used here are installed.
packages <- c("knitr","dplyr", "tidyr", "caret", "ggplot2", "plotly","lubridate","leaflet", "stringr")
if ( length(missing_pkgs <- setdiff(packages, rownames(installed.packages()))) > 0) {
message("Installing missing package(s): ", paste(missing_pkgs, collapse = ", "))
install.packages(missing_pkgs)
}
We will combine the Issue.Date and the Issue.Time into it’s own column. To do this we will need to restucture the data into a time series friendly format.
Now there is time stamp on all the dates, “T00:00:00”, that needs to be removed.
#Removing excess information
FTP$Issue.Date <- sub("T.*", "", FTP$Issue.Date)
We will use some string processing techniqes to clean up our Issue.Time column
Let’s take a look at the data and see if there is anything else we can wrangle
We can see in Latitude and Longitude a default value of 99999 that will need to be removed.
##We can notice that there is a default of 99999 when a cordinate isn't entered. We will remove these
FTP<- FTP %>%
filter(Latitude != 99999)
Now for converting the cordinates from US feet to Logitutde and Latitude cordinates
#Create projection element to convert from US Feet coordinates to normal lat lon
pj <- "+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 +y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 no_defs"
#Add converted latitude longitude to FTP dataframe
FTP<- cbind(FTP, data.frame(project(data.frame(FTP$Latitude, FTP$Longitude), proj = pj, inverse = TRUE)))
str(FTP)
## 'data.frame': 130342 obs. of 12 variables:
## $ Ticket.number : num 4.34e+09 4.34e+09 4.34e+09 4.34e+09 4.34e+09 ...
## $ Issue.Date : chr "2018-12-23" "2018-12-23" "2018-12-23" "2018-12-23" ...
## $ Issue.time : chr "8:30" "8:36" "8:40" "8:41" ...
## $ Route : chr "340R" "340R" "340R" "340R" ...
## $ Agency : int 53 53 53 53 53 53 53 55 55 55 ...
## $ Violation.code : chr "80.73.2" "80.56E4+" "80.69B" "80.69B" ...
## $ Violation.Description: chr "EXCEED 72HRS-ST" "RED ZONE" "NO PARKING" "NO PARKING" ...
## $ Fine.amount : int 68 93 73 73 73 73 68 363 68 73 ...
## $ Latitude : num 6444107 6442254 6444274 6444274 6444274 ...
## $ Longitude : num 1906219 1905504 1907268 1907268 1907268 ...
## $ x : num -118 -118 -118 -118 -118 ...
## $ y : num 34.2 34.2 34.2 34.2 34.2 ...
FTP <- FTP[-9:-10] #This removes the Latitude and Logitude in Feet from our table
names(FTP)[c(9, 10)] <- c('Longitude', 'Latitude') #Rename column names of converted longitude latitude
# Now our data is looking clean and usable
summary(FTP)
## Ticket.number Issue.Date Issue.time
## Min. :1.068e+09 Length:130342 Length:130342
## 1st Qu.:4.346e+09 Class :character Class :character
## Median :4.346e+09 Mode :character Mode :character
## Mean :4.292e+09
## 3rd Qu.:4.347e+09
## Max. :4.348e+09
##
## Route Agency Violation.code
## Length:130342 Min. : 1.00 Length:130342
## Class :character 1st Qu.:53.00 Class :character
## Mode :character Median :54.00 Mode :character
## Mean :53.16
## 3rd Qu.:55.00
## Max. :58.00
##
## Violation.Description Fine.amount Longitude Latitude
## Length:130342 Min. : 25.00 Min. :-118.7 Min. :33.71
## Class :character 1st Qu.: 63.00 1st Qu.:-118.4 1st Qu.:34.04
## Mode :character Median : 73.00 Median :-118.3 Median :34.06
## Mean : 70.77 Mean :-118.4 Mean :34.08
## 3rd Qu.: 73.00 3rd Qu.:-118.3 3rd Qu.:34.11
## Max. :363.00 Max. :-118.2 Max. :34.33
## NA's :27
After all the cleaning of our data, now we can format date and time, so we can better work with it in R
FTP$Date <- as.POSIXlt(paste(FTP$Issue.Date, FTP$Issue.time), format="%Y-%m-%d %H:%M")
Now lets identify the weekday each ticket was given and put it into a column
FTP$Weekdays <- weekdays(FTP$Date)
We can also store the hour of the day that the tickets were given and add a column
FTP$Hour <- FTP$Date$hour
summary(FTP)
## Ticket.number Issue.Date Issue.time
## Min. :1.068e+09 Length:130342 Length:130342
## 1st Qu.:4.346e+09 Class :character Class :character
## Median :4.346e+09 Mode :character Mode :character
## Mean :4.292e+09
## 3rd Qu.:4.347e+09
## Max. :4.348e+09
##
## Route Agency Violation.code
## Length:130342 Min. : 1.00 Length:130342
## Class :character 1st Qu.:53.00 Class :character
## Mode :character Median :54.00 Mode :character
## Mean :53.16
## 3rd Qu.:55.00
## Max. :58.00
##
## Violation.Description Fine.amount Longitude Latitude
## Length:130342 Min. : 25.00 Min. :-118.7 Min. :33.71
## Class :character 1st Qu.: 63.00 1st Qu.:-118.4 1st Qu.:34.04
## Mode :character Median : 73.00 Median :-118.3 Median :34.06
## Mean : 70.77 Mean :-118.4 Mean :34.08
## 3rd Qu.: 73.00 3rd Qu.:-118.3 3rd Qu.:34.11
## Max. :363.00 Max. :-118.2 Max. :34.33
## NA's :27
## Date Weekdays Hour
## Min. :2018-12-23 00:02:00 Length:130342 Min. : 0.00
## 1st Qu.:2019-01-01 08:40:00 Class :character 1st Qu.: 8.00
## Median :2019-01-08 15:40:00 Mode :character Median :11.00
## Mean :2019-01-08 08:29:28 Mean :11.29
## 3rd Qu.:2019-01-15 14:35:00 3rd Qu.:14.00
## Max. :2019-01-23 23:57:00 Max. :23.00
## NA's :17 NA's :17
sum(is.na(FTP)) #check for how many NAs there are
## [1] 78
FTP <- na.omit(FTP)# Very few for this many observation (less than 1%)
FTP <- FTP[-11]
summary(FTP)
## Ticket.number Issue.Date Issue.time
## Min. :1.068e+09 Length:130298 Length:130298
## 1st Qu.:4.346e+09 Class :character Class :character
## Median :4.346e+09 Mode :character Mode :character
## Mean :4.293e+09
## 3rd Qu.:4.347e+09
## Max. :4.348e+09
## Route Agency Violation.code
## Length:130298 Min. : 1.00 Length:130298
## Class :character 1st Qu.:53.00 Class :character
## Mode :character Median :54.00 Mode :character
## Mean :53.17
## 3rd Qu.:55.00
## Max. :58.00
## Violation.Description Fine.amount Longitude Latitude
## Length:130298 Min. : 25.00 Min. :-118.7 Min. :33.71
## Class :character 1st Qu.: 63.00 1st Qu.:-118.4 1st Qu.:34.04
## Mode :character Median : 73.00 Median :-118.3 Median :34.06
## Mean : 70.77 Mean :-118.4 Mean :34.08
## 3rd Qu.: 73.00 3rd Qu.:-118.3 3rd Qu.:34.11
## Max. :363.00 Max. :-118.2 Max. :34.33
## Weekdays Hour
## Length:130298 Min. : 0.00
## Class :character 1st Qu.: 8.00
## Mode :character Median :11.00
## Mean :11.29
## 3rd Qu.:14.00
## Max. :23.00
:Analysing the data First lets see the revenue they generated in a year
revenue <- sum(FTP$Fine.amount)
revenue
## [1] 9221737
Filter top 10 Violations
TopViolations <- FTP %>%
group_by(Violation.Description) %>%
tally() %>%
arrange(-n) %>%
head(10)
TopViolations
Now lets graph top 10 Violations throughout the year
#I need to make this for month and not year
TopViolationsLastYears <- FTP %>%
filter(Violation.Description %in%
TopViolations$Violation.Description)
p <- ggplot(TopViolationsLastYears, aes(Issue.Date)) +
geom_bar(aes(fill=Violation.Description), stat='count')
#Plot the data), stat='count')
ggplotly(p)
Lets see if we can find some more patterns in the data
#This one would be better for a month
DailyParkingViolation <- FTP %>%
group_by(Issue.Date) %>%
tally() %>%
ggplot(aes(x=Issue.Date, y=n)) +
geom_point()
DailyParkingViolation
It appears there is a cloud of data points towards the top and the bottom of the graph. That is interesting, and we will further need to investigate what this could be.
table(FTP$Weekday)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 17920 23383 2874 7501 23316 22381 32923
table(FTP$Hour)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 1438 3256 3115 1990 1579 1295 1729 2490 17109 8968 16545 11670
## 12 13 14 15 16 17 18 19 20 21 22 23
## 16307 7935 5602 3833 6012 4222 4249 3642 2330 1785 2029 1168
We will save this table as a data frame:
WeekdayCounts = as.data.frame(table(FTP$Weekday))
Create our plot
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1))
It will be easier to understand the data if the days are in order Lets lable our X and Y axis:
WeekdayCounts$Var1 = factor(WeekdayCounts$Var1, ordered=TRUE, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday")) #We can change the Var1 variable to be an ordered factor variable
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1)) +
xlab("Day of the Week") + ylab("Total Ticket Given Out")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
Adding the Hour of the Day
Create a counts table for the weekday and hour:
table(FTP$Weekday, FTP$Hour)
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## Friday 199 388 361 224 238 139 232 310 2664 1085 2483 1544
## Monday 252 609 587 399 139 215 303 444 2716 1776 2978 2415
## Saturday 97 129 25 19 38 31 14 23 71 117 132 165
## Sunday 263 411 510 365 163 193 244 164 414 420 329 535
## Thursday 152 461 511 306 272 165 269 491 3296 1877 3223 2066
## Tuesday 240 521 491 371 243 233 309 453 2810 1455 3018 1932
## Wednesday 235 737 630 306 486 319 358 605 5138 2238 4382 3013
##
## 12 13 14 15 16 17 18 19 20 21 22 23
## Friday 2514 977 784 593 738 485 483 476 323 263 288 129
## Monday 2947 1512 978 738 1177 774 749 639 294 255 318 169
## Saturday 160 95 107 67 222 253 274 213 164 103 196 159
## Sunday 385 373 325 142 403 317 401 305 333 202 165 139
## Thursday 3014 1569 1018 728 1040 725 682 426 316 266 263 180
## Tuesday 3159 1339 1035 666 950 702 659 575 410 295 334 181
## Wednesday 4128 2070 1355 899 1482 966 1001 1008 490 401 465 211
We will save this as a data frame
DayHourCounts = as.data.frame(table(FTP$Weekday, FTP$Hour))
DayHourCounts$Hour = as.numeric(as.character(DayHourCounts$Var2))# Convert the second variable, Var2, to numbers and call it Hour:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1))# Create out plot:
# Fix the order of the days:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Var1), size=2)
DayHourCounts$Type = ifelse((DayHourCounts$Var1 == "Sunday") | (DayHourCounts$Var1 == "Saturday"),
"Weekend", "Weekday")
# Redo our plot, this time coloring by Type:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2, alpha=0.5)
# Fix the order of the days:
DayHourCounts$Var1 = factor(DayHourCounts$Var1, ordered=TRUE, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq))
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) +
scale_fill_gradient(name="Total Tickets Given") +
theme(axis.title.y = element_blank())
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) +
scale_fill_gradient(name="Total Tickets Given", low="white", high="red") +
theme(axis.title.y = element_blank())
HotSpot1 <- FTP %>% filter(Violation.Description == c("NO PARK/STREET CLEAN","METER EXP.", "RED ZONE", "PREFERENTIAL PARKING",
"DISPLAY OF TABS", "NO PARKING","DISPLAY OF PLATES","PARKED OVER TIME LIMIT", "WHITE ZONE","NO STOP/STANDING",
"BLOCKING DRIVEWAY","STANDNG IN ALLEY"))
## Warning in Violation.Description == c("NO PARK/STREET CLEAN", "METER
## EXP.", : longer object length is not a multiple of shorter object length
table(HotSpot1$Violation.Description)
##
## BLOCKING DRIVEWAY DISPLAY OF PLATES DISPLAY OF TABS
## 171 319 642
## METER EXP. NO PARK/STREET CLEAN NO PARKING
## 1747 3599 401
## NO STOP/STANDING PARKED OVER TIME LIMIT PREFERENTIAL PARKING
## 231 291 801
## RED ZONE STANDNG IN ALLEY WHITE ZONE
## 927 131 222
# This is my reactive map # Click on the number and the map will show you
# I think I should make this just for a month
leaflet(data = HotSpot1) %>% addTiles() %>% addMarkers(
~Longitude, ~Latitude,clusterOptions = markerClusterOptions()
)